from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# 读取数据
def loadArticle(fileName):
	'''
	Load the test articles from the raw dataset file.

	Each line is cleaned by removing the <content>/</content> tags,
	ASCII and full-width spaces, and surrounding whitespace.
	:param fileName: path of the dataset file
	:return: list of cleaned article strings, one per input line
	'''
	cleaned = []
	with open(fileName, encoding='utf-8') as fh:
		# Iterate the file lazily instead of materializing all lines.
		for raw in fh:
			# Drop the XML-ish tags, then both kinds of spaces,
			# then trailing newline / surrounding whitespace.
			line = raw.replace("<content>", "").replace("</content>", "")
			line = line.replace(" ", "").replace("　", "")
			cleaned.append(line.strip())
	return cleaned

def test(file_path, n_clusters):
	'''
	Cluster the articles from *file_path* into *n_clusters* groups with KMeans.

	Side effects: writes each cluster's texts to
	cluster/k_means/cluster_<i>.txt and prints the labels and per-cluster
	top-10 keywords.
	:param file_path: path of the raw article file (see loadArticle)
	:param n_clusters: number of KMeans clusters
	:return: summary string, one "k_means/Cluster i: kw, ..." line per cluster
	'''
	test_article = loadArticle(file_path)

	# Load the stopword list ONCE. (Previously the file was re-opened and
	# re-parsed for every single article inside the loop below.)
	stop_words = set()
	with open('cluster/stopwords.txt', encoding='utf-8') as f:
		for line in f:
			stop_words.add(line.strip())

	# Tokenize each article with jieba and drop stopwords.
	texts = []
	for text in test_article:
		texts.append(' '.join(word for word in jieba.cut(text) if word not in stop_words))

	# Bag-of-words representation, vocabulary capped at 1000 terms.
	vectorizer = CountVectorizer(max_features=1000)
	X = vectorizer.fit_transform(texts)

	# KMeans clustering (fixed seed for reproducibility).
	cluster = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=0)
	cluster.fit(X.toarray())

	# Report cluster assignments.
	print('Cluster labels:', cluster.labels_)

	# Persist the texts of each cluster to its own file.
	for i in range(cluster.n_clusters):
		with open('cluster/k_means/cluster_{}.txt'.format(i), 'w', encoding='utf-8') as f:
			for j in range(len(texts)):
				if cluster.labels_[j] == i:
					f.write('{}\n'.format(texts[j]))

	# Summarize the top keywords for each cluster.
	summary_lines = []
	for i in range(cluster.n_clusters):
		cluster_texts = []
		with open('cluster/k_means/cluster_{}.txt'.format(i), 'r', encoding='utf-8') as f:
			for line in f:
				cluster_texts.append(line.strip())
		# Count tokens, keeping only non-stopwords longer than one character.
		word_count = Counter()
		for text in cluster_texts:
			word_count.update(
				word for word in jieba.cut(text)
				if word not in stop_words and len(word) > 1
			)
		# Stable sort by frequency (preserves first-seen order on ties,
		# matching the previous behavior) and take the top 10.
		sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
		top_words = [w for w, _ in sorted_words[:10]]
		summary = 'k_means/Cluster {}: {}'.format(i, ', '.join(top_words))
		print(summary)
		summary_lines.append(summary)
	# Each summary line is newline-terminated, as before.
	return ''.join(s + "\n" for s in summary_lines)
